The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
Object recognition
The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
● All the features are geometric features extracted from the silhouette.
● All are numeric in nature.
Apply a dimensionality-reduction technique — PCA — and train a model using the principal components instead of training the model on just the raw data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# Modelling
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import pandas_profiling
# ---- Load the data and take a first look ----
df = pd.read_csv("vehicle.csv")

# Preview the first five records
df.head()

# Shape, feature names, cardinality and dtypes in one overview
print ("Rows : " , df.shape[0])
print ("Columns : " , df.shape[1])
print ("\nFeatures : \n" , df.columns.tolist())
print ("\nUnique values : \n", df.nunique())
print("\n DataType : \n", df.dtypes)
# ---- Generic information and missing-value scan ----
df.info()

# NaN count per column.
# Observation: `compactness`, `max.length_aspect_ratio`,
# `max.length_rectangularity`, `hollows_ratio` and `class` have no
# missing values; every other feature has some.
df.isna().sum()

# Five-point summary, transposed for readability
df.describe().T

# Show the rows that contain at least one missing value
null_data = df[df.isnull().any(axis=1)]
null_data
# ---- Impute missing values with each column's median ----
cols_with_nan = df.columns[df.isnull().any()]
for col in cols_with_nan:
    # Median is robust to the outliers present in several features;
    # astype keeps the column's original dtype after filling.
    df[col] = df[col].fillna(df[col].median()).astype(df[col].dtype)

# Verify the imputation: no NaNs should remain
df.info()
df.isna().sum()

# Five-point summary after imputation
df.describe().T
# ---- Univariate distribution of each of the 18 geometric features ----
feature_cols = ['compactness', 'circularity', 'distance_circularity', 'radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity', 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'skewness_about.2', 'hollows_ratio']
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(10, 20))
palette = {0: 'g', 1: 'b', 2: 'r'}
for idx, col in enumerate(feature_cols):
    row, pos = divmod(idx, 3)
    # Cycle through three colors across each row of the 6x3 grid
    sns.distplot(df[col], ax=axes[row, pos], color=palette[pos])
plt.tight_layout()
# ---- Outlier inspection and removal (1.5 * IQR rule) ----
# Boxplots reveal outliers in `radius_ratio`, `pr.axis_aspect_ratio`,
# `max.length_aspect_ratio`, `scaled_variance`, `scaled_variance.1`,
# `scaled_radius_of_gyration.1`, `skewness_about` and `skewness_about.1`.
fig = plt.figure(figsize=(15, 8))
ax = sns.boxplot(data=df.iloc[:, 0:18], orient='h')

# Quartiles and inter-quartile range for every numeric column
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

# Keep only rows whose every value lies inside the whiskers
df = df[~((df < lower_fence) | (df > upper_fence)).any(axis=1)]
df.count()
# ---- Re-plot the feature distributions after outlier removal ----
feature_cols = ['compactness', 'circularity', 'distance_circularity', 'radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity', 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'skewness_about.2', 'hollows_ratio']
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(10, 20))
palette = {0: 'g', 1: 'b', 2: 'r'}
for idx, col in enumerate(feature_cols):
    row, pos = divmod(idx, 3)
    sns.distplot(df[col], ax=axes[row, pos], color=palette[pos])
plt.tight_layout()
# ---- Correlation structure ----
# Most attributes are strongly correlated (positively or negatively).
# Rather than dropping collinear columns, we let PCA handle the
# dimensionality reduction later.
f, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(df.corr(), annot=True, linewidths=0.5, cmap='coolwarm')
sns.pairplot(df, diag_kind='kde')

# ---- Distribution of the target variable ----
df['class'].value_counts()
df['class'].value_counts(normalize=True) * 100
pd.value_counts(df["class"]).plot(kind="bar", colormap="Accent")
# ---- Feature / target split ----
X = df.drop("class", axis=1)
# NOTE: pop removes the class column from df itself; later cells rely on
# df holding only the 18 feature columns.
y = df.pop("class")

from sklearn.model_selection import train_test_split
# Hold out 30% of the data; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)
print('Original number of features:', X.shape[1])
# ---- Baseline SVC on raw (unscaled) features, default hyper-parameters ----
model = SVC()
skf = StratifiedKFold(n_splits=10)
model.fit(X_train, y_train)

# 10-fold stratified cross-validation on the training data
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X=X_train, y=y_train, cv=skf)
print("\nCross Validation Score:", round(scores.mean(), 2).astype(str))

# Hold-out evaluation: accuracy, per-class report, confusion matrix
y_pred = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred).round(2))
print('\n{}'.format(classification_report(y_test, y_pred)))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# ---- SVC with hyper-parameter tuning on standardized features ----
# SVMs are distance-based, so standardize first (fit the scaler on the
# training set only and reuse it for the test set).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = SVC()
# Hyper-parameter search space
params = {'C': [0.01,0.05,0.5,1], 'kernel': ['linear','rbf']}
skf = StratifiedKFold(n_splits = 10)
model = GridSearchCV(classifier, param_grid=params, n_jobs=10, cv = skf)
model.fit(X_train, y_train)

# Best hyper-parameters found by the grid search
print("Best Hyper Parameters:\n",model.best_params_)
print('\nTrain Accuracy: {0:.2f}'.format(model.score(X_train, y_train)))

## Cross Validation Score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X=X_train, y=y_train, cv = skf)
print("\nCross Validation Score:",round(scores.mean(), 2).astype(str))

# Prediction and evaluation on the hold-out set
y_pred = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test,y_pred).round(2))
print('\n{}'.format(classification_report(y_test,y_pred)))
# BUG FIX: the confusion-matrix print was missing here even though the
# section's comment announced it; add it for parity with the other runs.
print("Confusion Matrix:\n", confusion_matrix(y_test,y_pred))
# ---- Fresh split, scaling, and PCA retaining ~95% of the variance ----
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# n_components as a float asks PCA to keep just enough components to
# explain that fraction of the variance.
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train)
# BUG FIX: the test set must be projected with the PCA fitted on the
# training set. The original called pca.fit_transform(X_test), which
# refits PCA on the test data — that both leaks test information and
# projects train and test onto different component axes.
X_test = pca.transform(X_test)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
n_components = X_train.shape[1]
print('Reduced number of features:', n_components)

# Pairplot of the (now decorrelated) principal components
sns.pairplot(pd.DataFrame(X_train), diag_kind='kde')

print(pca.components_)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

# Cumulative-variance (scree) plot. Use the actual number of retained
# components instead of the hard-coded 7 of the original.
plt.step(list(range(1, n_components + 1)), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# ---- SVC trained on the principal components ----
classifier = SVC()
# Same hyper-parameter grid as the raw-data run, for a fair comparison
params = {'C': [0.01,0.05,0.5,1], 'kernel': ['linear','rbf']}
skf = StratifiedKFold(n_splits=10)
model = GridSearchCV(classifier, param_grid=params, n_jobs=10, cv=skf)
model.fit(X_train, y_train)

# Winning hyper-parameters and training accuracy
print("Best Hyper Parameters:\n", model.best_params_)
print('\nTrain Accuracy:', model.score(X_train, y_train).round(2))

## Cross Validation Score
from sklearn.model_selection import cross_val_score
# Score a fresh SVC with the chosen hyper-parameters
best_svc = SVC(C=1, kernel="rbf")
scores = cross_val_score(best_svc, X=X_train, y=y_train, cv=skf)
print("\nCross Validation Score:", scores.mean().round(2))

# Hold-out evaluation in PCA space
y_pred = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred).round(2))
print('\n{}'.format(classification_report(y_test, y_pred)))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# ---- Visualize each feature's contribution to the principal components ----
# Derive the component count from the fitted PCA instead of hard-coding 7,
# so these plots stay correct if PCA retains a different number of components.
n_components = pca.components_.shape[0]

fig, ax = plt.subplots(figsize=(20, 15))
plt.imshow(
    pca.components_.T,  # rows = original features, columns = components
    cmap="Spectral",
    vmin=-1,
    vmax=1,
)
plt.yticks(range(len(X.columns)), X.columns)
plt.xticks(range(n_components), range(1, n_components + 1))
plt.xlabel("Principal Component")
plt.ylabel("Contribution")
plt.title(
    "Contribution of Features to Components"
)
plt.colorbar()

fig, ax = plt.subplots(figsize=(20, 10))
# Use X.columns (the 18 feature names) directly rather than df.columns,
# which only matched because df.pop("class") mutated df earlier.
pd.DataFrame(
    pca.components_, columns=X.columns
).plot(kind="bar", ax=ax).legend(
    bbox_to_anchor=(1, 1)
)
Support vector machines – one trained using raw data
Without HyperParameter Tuning: Cross Validation Score: 0.62, Test Accuracy: 0.64
With HyperParameter Tuning: Cross Validation Score: 0.96 Test Accuracy: 0.98
Support vector machines – one trained using Principal Components
Cross Validation Score: 0.92 Test Accuracy: 0.78
PCA is a statistical technique that reduces the dimensionality of the data by selecting the principal components that capture the maximum information about the dataset. Here we have reduced the dimensionality from 18 features to 7 principal components, which together explain 95% of the variance.
Based on the accuracy scores, cross-validation scores and other metrics such as recall and precision, the SVC with hyper-parameter tuning trained on the raw data performs much better than the one trained on the principal components.